This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
df = read.csv("Traffic_Crashes_250206.csv")
df_filtered = df %>%
filter(MOST_SEVERE_INJURY != "REPORTED, NOT EVIDENT")
df_filtered$MOST_SEVERE_INJURY <- factor(df_filtered$MOST_SEVERE_INJURY,
levels = c("NO INDICATION OF INJURY",
"NONINCAPACITATING INJURY",
"INCAPACITATING INJURY",
"FATAL"),
ordered = TRUE)
API_key <- "24f32467-3038-4dc5-bde1-a3a7806ec34e"
register_stadiamaps(API_key)
map <- get_stadiamap(c(left = -87.8, bottom = 41.7,
right = -87.6, top = 42), zoom = 12,
maptype = "stamen_toner_lite")
ggmap(map)
df_sampled = df_filtered[sample(1000),]
weather = df_sampled%>%
filter(WEATHER_CONDITION != "")
ggmap(map) +
geom_point(data = weather,
aes(x = LONGITUDE, y = LATITUDE, color = WEATHER_CONDITION,size = MOST_SEVERE_INJURY, alpha = 0.7))
df_sampled = df_filtered[sample(1000),]
weather = df_sampled%>%
filter(WEATHER_CONDITION != "")
bad_weather = weather%>%
filter(WEATHER_CONDITION != "CLEAR")
ggmap(map) +
geom_point(data = bad_weather,
aes(x = LONGITUDE, y = LATITUDE, color = WEATHER_CONDITION,size = MOST_SEVERE_INJURY, alpha = 0.7))
Rain causes the most severe injury, and most likely to be the incapacitating injury.
df_sampled = df_filtered[sample(1000),]
lighting = df_sampled%>%
filter(LIGHTING_CONDITION != "")
ggmap(map) +
geom_point(data = lighting,
aes(x = LONGITUDE, y = LATITUDE, color = LIGHTING_CONDITION,size = MOST_SEVERE_INJURY, alpha = 0.7))
df_sampled = df_filtered[sample(1000),]
raodcondi = df_sampled%>%
filter(ROADWAY_SURFACE_COND != "")
ggmap(map) +
geom_point(data = raodcondi,
aes(x = LONGITUDE, y = LATITUDE, color = ROADWAY_SURFACE_COND,size = MOST_SEVERE_INJURY, alpha = 0.7))
df_sampled = df_filtered[sample(1000),]
bad_raodcondi = df_sampled%>%
filter(ROADWAY_SURFACE_COND != "DRY")
ggmap(map) +
geom_point(data = bad_raodcondi,
aes(x = LONGITUDE, y = LATITUDE, color = ROADWAY_SURFACE_COND,size = MOST_SEVERE_INJURY, alpha = 0.7))
Primary cause relate to severity
df_sampled = df_filtered[sample(1000),]
prim_cause = df_sampled%>%
filter(PRIM_CONTRIBUTORY_CAUSE != "")
ggmap(map) +
geom_point(data = prim_cause,
aes(x = LONGITUDE, y = LATITUDE, color = PRIM_CONTRIBUTORY_CAUSE,size = MOST_SEVERE_INJURY, alpha = 0.7))+
theme(legend.text = element_text(size = 4), # Reduce text size
legend.title = element_text(size = 5), # Smaller title
legend.key.size = unit(0.25, "cm")) # Reduce legend box size
Failing to reduce speed, failing to yield, following too closely are
causing more severe cases.
Does speed limit influence severety?
df_sampled = df_filtered[sample(3000),]
df_sampled = df_sampled %>%
filter(MOST_SEVERE_INJURY != "NO INDICATION OF INJURY")
speed = df_sampled%>%
filter(POSTED_SPEED_LIMIT != "")
ggmap(map) +
geom_point(data = speed,
aes(x = LONGITUDE, y = LATITUDE, color = MOST_SEVERE_INJURY,size = POSTED_SPEED_LIMIT, alpha = 0.7))
More severe accidents are in higher speed limit roads
Relationship with hit and run
df_sampled = df_filtered[sample(4000),]
hitrun = df_sampled%>%
filter(HIT_AND_RUN_I != "")
ggmap(map) +
geom_point(data = hitrun,
aes(x = LONGITUDE, y = LATITUDE, color = HIT_AND_RUN_I,size = MOST_SEVERE_INJURY, alpha = 0.7))
There are no hit and run for fatal injury
df_recent = read.csv("Traffic_Crashes_250206.csv")
# Removes rows where MOST_SEVERE_INJURY is exactly ""
df <- df_recent[df_recent$MOST_SEVERE_INJURY != "", ]
df <- df[df$MOST_SEVERE_INJURY != "REPORTED, NOT EVIDENT", ]
table(df$MOST_SEVERE_INJURY)
##
## FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## 851 11480 482399
## NONINCAPACITATING INJURY
## 53008
library(dplyr)
# 1. Define your majority classes
majority_classes <- c(
"NO INDICATION OF INJURY",
"NONINCAPACITATING INJURY",
"INCAPACITATING INJURY"
)
# 2. Split the data into 'majority' and 'minority' subsets
df_majority <- df %>%
filter(MOST_SEVERE_INJURY %in% majority_classes)
df_minority <- df %>%
filter(!MOST_SEVERE_INJURY %in% majority_classes)
# i.e., "FATAL" + "INCAPACITATING INJURY"
# 3. Undersample each majority class to a chosen size
# Adjust 'target_size' to suit your needs.
target_size <- 2000
set.seed(123) # for reproducibility
df_majority_undersampled <- df_majority %>%
group_by(MOST_SEVERE_INJURY) %>%
sample_n(size = target_size, replace = FALSE) %>%
ungroup()
# 4. Combine the minority subset (kept intact) with the undersampled majority
df_undersampled <- bind_rows(df_minority, df_majority_undersampled)
# 5. Check new distribution
table(df_undersampled$MOST_SEVERE_INJURY)
##
## FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## 851 2000 2000
## NONINCAPACITATING INJURY
## 2000
library(glmnet)
df_undersampled <- df_undersampled %>% drop_na() # Remove rows with NA values
# Build the design matrix (X) and outcome (y)
X <- model.matrix(MOST_SEVERE_INJURY ~ POSTED_SPEED_LIMIT +TRAFFIC_CONTROL_DEVICE+DEVICE_CONDITION+WEATHER_CONDITION+LIGHTING_CONDITION+FIRST_CRASH_TYPE+TRAFFICWAY_TYPE+ALIGNMENT+ROADWAY_SURFACE_COND+INTERSECTION_RELATED_I+NOT_RIGHT_OF_WAY_I+HIT_AND_RUN_I+PRIM_CONTRIBUTORY_CAUSE+CRASH_HOUR+CRASH_MONTH+LATITUDE+LONGITUDE, data = df_undersampled)[, -1]
y <- df_undersampled$MOST_SEVERE_INJURY
# Perform cross-validation for multinomial logistic LASSO
cvfit <- cv.glmnet(
x = X,
y = y,
family = "multinomial", # for multi-class
type.multinomial = "grouped", # treats coefficients of each class as a group
alpha = 1, # alpha=1 => LASSO penalty
nfolds = 5 # 5-fold cross-validation (adjust as needed)
)
# Plot cross-validation curves
plot(cvfit)
# Identify best lambda
best_lambda <- cvfit$lambda.1se
best_lambda
## [1] 0.0261
classnames <- cvfit$glmnet.fit$classnames
y <- factor(y, levels = classnames) # Ensure correct class alignment
# Refit the final model at best lambda
final_model <- glmnet(
x = X,
y = y,
family = "multinomial",
alpha = 1,
lambda = best_lambda
)
# Extract coefficients at best lambda
coef_matrix <- coef(final_model, s = best_lambda)
# Convert to a readable format
coef_list <- lapply(coef_matrix, function(m) as.matrix(m))
# Function to extract only non-zero coefficients
extract_nonzero_coefs <- function(coef_matrix) {
non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE] # Keep only non-zero coefficients
return(rownames(non_zero_coefs)) # Return feature names
}
# Get non-zero features for each injury class
important_features <- lapply(coef_list, extract_nonzero_coefs)
# Print only the selected features (without zero coefficients)
important_features
## $FATAL
## [1] "(Intercept)"
## [2] "TRAFFIC_CONTROL_DEVICESCHOOL ZONE"
## [3] "LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD"
## [4] "LIGHTING_CONDITIONDAYLIGHT"
## [5] "FIRST_CRASH_TYPEFIXED OBJECT"
## [6] "FIRST_CRASH_TYPEPEDESTRIAN"
## [7] "FIRST_CRASH_TYPEREAR END"
## [8] "PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT"
## [9] "PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER"
## [10] "LATITUDE"
##
## $`INCAPACITATING INJURY`
## [1] "(Intercept)"
## [2] "DEVICE_CONDITIONOTHER"
## [3] "LIGHTING_CONDITIONDAYLIGHT"
## [4] "TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER"
## [5] "TRAFFICWAY_TYPERAMP"
## [6] "INTERSECTION_RELATED_IY"
##
## $`NO INDICATION OF INJURY`
## [1] "(Intercept)"
## [2] "POSTED_SPEED_LIMIT"
## [3] "LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD"
## [4] "LIGHTING_CONDITIONUNKNOWN"
## [5] "FIRST_CRASH_TYPEFIXED OBJECT"
## [6] "FIRST_CRASH_TYPEHEAD ON"
## [7] "FIRST_CRASH_TYPEPARKED MOTOR VEHICLE"
## [8] "FIRST_CRASH_TYPEPEDALCYCLIST"
## [9] "FIRST_CRASH_TYPEPEDESTRIAN"
## [10] "FIRST_CRASH_TYPESIDESWIPE SAME DIRECTION"
## [11] "TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER"
## [12] "TRAFFICWAY_TYPEONE-WAY"
## [13] "TRAFFICWAY_TYPEPARKING LOT"
## [14] "INTERSECTION_RELATED_IY"
## [15] "HIT_AND_RUN_IY"
## [16] "PRIM_CONTRIBUTORY_CAUSEDISREGARDING TRAFFIC SIGNALS"
## [17] "PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT"
## [18] "PRIM_CONTRIBUTORY_CAUSEFOLLOWING TOO CLOSELY"
## [19] "PRIM_CONTRIBUTORY_CAUSEIMPROPER BACKING"
## [20] "PRIM_CONTRIBUTORY_CAUSEIMPROPER OVERTAKING/PASSING"
## [21] "PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER"
## [22] "PRIM_CONTRIBUTORY_CAUSETURNING RIGHT ON RED"
## [23] "PRIM_CONTRIBUTORY_CAUSEUNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)"
## [24] "LATITUDE"
##
## $`NONINCAPACITATING INJURY`
## [1] "(Intercept)" "FIRST_CRASH_TYPETURNING"
###logestic regression
# Corrected feature names based on dataset
selected_features <- c(
"POSTED_SPEED_LIMIT",
"LIGHTING_CONDITION",
"PRIM_CONTRIBUTORY_CAUSE",
"DEVICE_CONDITION",
"TRAFFICWAY_TYPE",
"INTERSECTION_RELATED_I"
)
# Select only these features from df_undersampled
df_final <- df_undersampled %>%
select(all_of(selected_features), MOST_SEVERE_INJURY) %>%
drop_na() # Remove any missing values
library(nnet)
library(caret)
# Ensure MOST_SEVERE_INJURY is a factor
df_final$MOST_SEVERE_INJURY <- as.factor(df_final$MOST_SEVERE_INJURY)
# Split into train and test sets
set.seed(42)
train_index <- createDataPartition(df_final$MOST_SEVERE_INJURY, p = 0.8, list = FALSE)
train_data <- df_final[train_index, ]
test_data <- df_final[-train_index, ]
# Train multinomial logistic regression model
logistic_model <- multinom(MOST_SEVERE_INJURY ~ ., data = train_data)
## # weights: 220 (162 variable)
## initial value 1465.313140
## iter 10 value 1306.466809
## iter 20 value 1213.310229
## iter 30 value 1203.513573
## iter 40 value 1198.462365
## iter 50 value 1196.637961
## iter 60 value 1195.891750
## iter 70 value 1195.598511
## iter 80 value 1195.477049
## iter 90 value 1195.449429
## iter 100 value 1195.431957
## final value 1195.431957
## stopped after 100 iterations
# Summary of model
summary(logistic_model)
## Call:
## multinom(formula = MOST_SEVERE_INJURY ~ ., data = train_data)
##
## Coefficients:
## (Intercept) POSTED_SPEED_LIMIT
## INCAPACITATING INJURY 2.16 -0.0107
## NO INDICATION OF INJURY -6.75 -0.0303
## NONINCAPACITATING INJURY 7.36 0.0354
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD
## INCAPACITATING INJURY -0.806
## NO INDICATION OF INJURY -1.116
## NONINCAPACITATING INJURY -0.270
## LIGHTING_CONDITIONDAWN LIGHTING_CONDITIONDAYLIGHT
## INCAPACITATING INJURY -1.15 -0.0235
## NO INDICATION OF INJURY -1.11 -0.3311
## NONINCAPACITATING INJURY -1.28 -0.0825
## LIGHTING_CONDITIONDUSK LIGHTING_CONDITIONUNKNOWN
## INCAPACITATING INJURY 0.0982 -1.299
## NO INDICATION OF INJURY 0.4224 0.495
## NONINCAPACITATING INJURY 0.3986 -1.676
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING OTHER TRAFFIC SIGNS
## INCAPACITATING INJURY -0.776
## NO INDICATION OF INJURY -22.688
## NONINCAPACITATING INJURY -28.779
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING ROAD MARKINGS
## INCAPACITATING INJURY 17.10
## NO INDICATION OF INJURY -4.90
## NONINCAPACITATING INJURY -6.83
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING STOP SIGN
## INCAPACITATING INJURY 1.31
## NO INDICATION OF INJURY 1.01
## NONINCAPACITATING INJURY -6.45
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING TRAFFIC SIGNALS
## INCAPACITATING INJURY -0.103
## NO INDICATION OF INJURY -1.013
## NONINCAPACITATING INJURY -7.263
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING YIELD SIGN
## INCAPACITATING INJURY 15.94
## NO INDICATION OF INJURY -4.81
## NONINCAPACITATING INJURY -7.36
## PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM INSIDE VEHICLE
## INCAPACITATING INJURY -1.21
## NO INDICATION OF INJURY -21.48
## NONINCAPACITATING INJURY -7.66
## PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM OUTSIDE VEHICLE
## INCAPACITATING INJURY -0.370
## NO INDICATION OF INJURY -0.483
## NONINCAPACITATING INJURY -8.212
## PRIM_CONTRIBUTORY_CAUSEDRIVING ON WRONG SIDE/WRONG WAY
## INCAPACITATING INJURY -0.346
## NO INDICATION OF INJURY -1.859
## NONINCAPACITATING INJURY -8.666
## PRIM_CONTRIBUTORY_CAUSEDRIVING SKILLS/KNOWLEDGE/EXPERIENCE
## INCAPACITATING INJURY 19.3
## NO INDICATION OF INJURY 19.9
## NONINCAPACITATING INJURY 11.8
## PRIM_CONTRIBUTORY_CAUSEEQUIPMENT - VEHICLE CONDITION
## INCAPACITATING INJURY 13.55
## NO INDICATION OF INJURY 15.75
## NONINCAPACITATING INJURY 7.57
## PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT
## INCAPACITATING INJURY -1.52
## NO INDICATION OF INJURY -3.18
## NONINCAPACITATING INJURY -9.31
## PRIM_CONTRIBUTORY_CAUSEEXCEEDING SAFE SPEED FOR CONDITIONS
## INCAPACITATING INJURY -0.178
## NO INDICATION OF INJURY 0.800
## NONINCAPACITATING INJURY -8.992
## PRIM_CONTRIBUTORY_CAUSEFAILING TO REDUCE SPEED TO AVOID CRASH
## INCAPACITATING INJURY -0.06857
## NO INDICATION OF INJURY 0.00273
## NONINCAPACITATING INJURY -7.12365
## PRIM_CONTRIBUTORY_CAUSEFAILING TO YIELD RIGHT-OF-WAY
## INCAPACITATING INJURY 0.702
## NO INDICATION OF INJURY 1.111
## NONINCAPACITATING INJURY -6.450
## PRIM_CONTRIBUTORY_CAUSEFOLLOWING TOO CLOSELY
## INCAPACITATING INJURY 1.65
## NO INDICATION OF INJURY 2.95
## NONINCAPACITATING INJURY -4.96
## PRIM_CONTRIBUTORY_CAUSEHAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)
## INCAPACITATING INJURY 15.62
## NO INDICATION OF INJURY -7.77
## NONINCAPACITATING INJURY 8.49
## PRIM_CONTRIBUTORY_CAUSEIMPROPER BACKING
## INCAPACITATING INJURY 13.7
## NO INDICATION OF INJURY 16.8
## NONINCAPACITATING INJURY 7.5
## PRIM_CONTRIBUTORY_CAUSEIMPROPER LANE USAGE
## INCAPACITATING INJURY 0.814
## NO INDICATION OF INJURY 2.419
## NONINCAPACITATING INJURY -6.069
## PRIM_CONTRIBUTORY_CAUSEIMPROPER OVERTAKING/PASSING
## INCAPACITATING INJURY 1.13
## NO INDICATION OF INJURY 2.50
## NONINCAPACITATING INJURY -6.37
## PRIM_CONTRIBUTORY_CAUSEIMPROPER TURNING/NO SIGNAL
## INCAPACITATING INJURY 0.813
## NO INDICATION OF INJURY 1.950
## NONINCAPACITATING INJURY -6.320
## PRIM_CONTRIBUTORY_CAUSENOT APPLICABLE
## INCAPACITATING INJURY -0.810
## NO INDICATION OF INJURY -0.406
## NONINCAPACITATING INJURY -8.296
## PRIM_CONTRIBUTORY_CAUSEOPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER
## INCAPACITATING INJURY -0.345
## NO INDICATION OF INJURY -1.275
## NONINCAPACITATING INJURY -7.780
## PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER
## INCAPACITATING INJURY -1.15
## NO INDICATION OF INJURY -2.45
## NONINCAPACITATING INJURY -8.38
## PRIM_CONTRIBUTORY_CAUSEROAD ENGINEERING/SURFACE/MARKING DEFECTS
## INCAPACITATING INJURY -6.18
## NO INDICATION OF INJURY 17.57
## NONINCAPACITATING INJURY -6.93
## PRIM_CONTRIBUTORY_CAUSETEXTING
## INCAPACITATING INJURY -6.09
## NO INDICATION OF INJURY -5.12
## NONINCAPACITATING INJURY 15.73
## PRIM_CONTRIBUTORY_CAUSETURNING RIGHT ON RED
## INCAPACITATING INJURY -6.87
## NO INDICATION OF INJURY 18.44
## NONINCAPACITATING INJURY -6.82
## PRIM_CONTRIBUTORY_CAUSEUNABLE TO DETERMINE
## INCAPACITATING INJURY -0.373
## NO INDICATION OF INJURY 0.224
## NONINCAPACITATING INJURY -7.761
## PRIM_CONTRIBUTORY_CAUSEUNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)
## INCAPACITATING INJURY -0.181
## NO INDICATION OF INJURY -17.670
## NONINCAPACITATING INJURY -7.683
## PRIM_CONTRIBUTORY_CAUSEVISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)
## INCAPACITATING INJURY -0.188
## NO INDICATION OF INJURY -0.440
## NONINCAPACITATING INJURY -7.760
## PRIM_CONTRIBUTORY_CAUSEWEATHER
## INCAPACITATING INJURY 17.9
## NO INDICATION OF INJURY 18.5
## NONINCAPACITATING INJURY 11.5
## DEVICE_CONDITIONFUNCTIONING PROPERLY
## INCAPACITATING INJURY -0.0377
## NO INDICATION OF INJURY 9.2468
## NONINCAPACITATING INJURY -0.3784
## DEVICE_CONDITIONNO CONTROLS
## INCAPACITATING INJURY -0.23
## NO INDICATION OF INJURY 8.79
## NONINCAPACITATING INJURY -0.57
## DEVICE_CONDITIONNOT FUNCTIONING DEVICE_CONDITIONOTHER
## INCAPACITATING INJURY -0.191 16.1
## NO INDICATION OF INJURY 8.511 22.4
## NONINCAPACITATING INJURY -21.656 15.1
## DEVICE_CONDITIONUNKNOWN
## INCAPACITATING INJURY 0.0689
## NO INDICATION OF INJURY 8.9131
## NONINCAPACITATING INJURY -0.1265
## TRAFFICWAY_TYPECENTER TURN LANE
## INCAPACITATING INJURY -0.926
## NO INDICATION OF INJURY -1.799
## NONINCAPACITATING INJURY -0.557
## TRAFFICWAY_TYPEDIVIDED - W/MEDIAN (NOT RAISED)
## INCAPACITATING INJURY -0.557
## NO INDICATION OF INJURY -0.198
## NONINCAPACITATING INJURY 0.407
## TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER
## INCAPACITATING INJURY -0.136
## NO INDICATION OF INJURY -0.746
## NONINCAPACITATING INJURY -0.365
## TRAFFICWAY_TYPEDRIVEWAY TRAFFICWAY_TYPENOT DIVIDED
## INCAPACITATING INJURY -12.1 -0.393
## NO INDICATION OF INJURY 14.3 0.198
## NONINCAPACITATING INJURY 15.0 0.523
## TRAFFICWAY_TYPEONE-WAY TRAFFICWAY_TYPEOTHER
## INCAPACITATING INJURY -0.7291 0.1597
## NO INDICATION OF INJURY 0.5823 -0.0841
## NONINCAPACITATING INJURY 0.0056 0.5888
## TRAFFICWAY_TYPEPARKING LOT TRAFFICWAY_TYPERAMP
## INCAPACITATING INJURY -0.415 22.55
## NO INDICATION OF INJURY 1.811 -3.97
## NONINCAPACITATING INJURY 1.394 -4.88
## TRAFFICWAY_TYPEUNKNOWN INTERSECTION_RELATED_IN
## INCAPACITATING INJURY 0.229 -1.667
## NO INDICATION OF INJURY -0.453 -1.101
## NONINCAPACITATING INJURY 1.530 -0.738
## INTERSECTION_RELATED_IY
## INCAPACITATING INJURY 0.202
## NO INDICATION OF INJURY -0.698
## NONINCAPACITATING INJURY 0.123
##
## Std. Errors:
## (Intercept) POSTED_SPEED_LIMIT
## INCAPACITATING INJURY 1.94 0.0249
## NO INDICATION OF INJURY 1.22 0.0250
## NONINCAPACITATING INJURY 2.20 0.0276
## LIGHTING_CONDITIONDARKNESS, LIGHTED ROAD
## INCAPACITATING INJURY 0.658
## NO INDICATION OF INJURY 0.679
## NONINCAPACITATING INJURY 0.675
## LIGHTING_CONDITIONDAWN LIGHTING_CONDITIONDAYLIGHT
## INCAPACITATING INJURY 0.875 0.656
## NO INDICATION OF INJURY 0.898 0.675
## NONINCAPACITATING INJURY 0.918 0.677
## LIGHTING_CONDITIONDUSK LIGHTING_CONDITIONUNKNOWN
## INCAPACITATING INJURY 1.04 1.21
## NO INDICATION OF INJURY 1.04 1.05
## NONINCAPACITATING INJURY 1.05 1.37
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING OTHER TRAFFIC SIGNS
## INCAPACITATING INJURY 1.26e+00
## NO INDICATION OF INJURY 1.47e-09
## NONINCAPACITATING INJURY 1.35e-08
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING ROAD MARKINGS
## INCAPACITATING INJURY 7.32e-08
## NO INDICATION OF INJURY 3.30e-10
## NONINCAPACITATING INJURY 7.28e-08
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING STOP SIGN
## INCAPACITATING INJURY 1.03
## NO INDICATION OF INJURY 1.09
## NONINCAPACITATING INJURY 1.05
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING TRAFFIC SIGNALS
## INCAPACITATING INJURY 0.484
## NO INDICATION OF INJURY 0.653
## NONINCAPACITATING INJURY 0.487
## PRIM_CONTRIBUTORY_CAUSEDISREGARDING YIELD SIGN
## INCAPACITATING INJURY 7.16e-08
## NO INDICATION OF INJURY 5.16e-10
## NONINCAPACITATING INJURY 6.23e-08
## PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM INSIDE VEHICLE
## INCAPACITATING INJURY 9.19e-01
## NO INDICATION OF INJURY 9.69e-09
## NONINCAPACITATING INJURY 8.00e-01
## PRIM_CONTRIBUTORY_CAUSEDISTRACTION - FROM OUTSIDE VEHICLE
## INCAPACITATING INJURY 0.870
## NO INDICATION OF INJURY 0.959
## NONINCAPACITATING INJURY 1.002
## PRIM_CONTRIBUTORY_CAUSEDRIVING ON WRONG SIDE/WRONG WAY
## INCAPACITATING INJURY 0.619
## NO INDICATION OF INJURY 1.077
## NONINCAPACITATING INJURY 0.743
## PRIM_CONTRIBUTORY_CAUSEDRIVING SKILLS/KNOWLEDGE/EXPERIENCE
## INCAPACITATING INJURY 0.292
## NO INDICATION OF INJURY 0.283
## NONINCAPACITATING INJURY 0.318
## PRIM_CONTRIBUTORY_CAUSEEQUIPMENT - VEHICLE CONDITION
## INCAPACITATING INJURY 0.795
## NO INDICATION OF INJURY 0.641
## NONINCAPACITATING INJURY 0.620
## PRIM_CONTRIBUTORY_CAUSEEXCEEDING AUTHORIZED SPEED LIMIT
## INCAPACITATING INJURY 0.488
## NO INDICATION OF INJURY 1.016
## NONINCAPACITATING INJURY 0.589
## PRIM_CONTRIBUTORY_CAUSEEXCEEDING SAFE SPEED FOR CONDITIONS
## INCAPACITATING INJURY 0.854
## NO INDICATION OF INJURY 0.827
## NONINCAPACITATING INJURY 1.204
## PRIM_CONTRIBUTORY_CAUSEFAILING TO REDUCE SPEED TO AVOID CRASH
## INCAPACITATING INJURY 0.432
## NO INDICATION OF INJURY 0.458
## NONINCAPACITATING INJURY 0.427
## PRIM_CONTRIBUTORY_CAUSEFAILING TO YIELD RIGHT-OF-WAY
## INCAPACITATING INJURY 0.411
## NO INDICATION OF INJURY 0.419
## NONINCAPACITATING INJURY 0.415
## PRIM_CONTRIBUTORY_CAUSEFOLLOWING TOO CLOSELY
## INCAPACITATING INJURY 1.014
## NO INDICATION OF INJURY 0.992
## NONINCAPACITATING INJURY 1.000
## PRIM_CONTRIBUTORY_CAUSEHAD BEEN DRINKING (USE WHEN ARREST IS NOT MADE)
## INCAPACITATING INJURY 7.21e-01
## NO INDICATION OF INJURY 4.55e-11
## NONINCAPACITATING INJURY 7.21e-01
## PRIM_CONTRIBUTORY_CAUSEIMPROPER BACKING
## INCAPACITATING INJURY 0.712
## NO INDICATION OF INJURY 0.474
## NONINCAPACITATING INJURY 0.580
## PRIM_CONTRIBUTORY_CAUSEIMPROPER LANE USAGE
## INCAPACITATING INJURY 1.06
## NO INDICATION OF INJURY 1.01
## NONINCAPACITATING INJURY 1.05
## PRIM_CONTRIBUTORY_CAUSEIMPROPER OVERTAKING/PASSING
## INCAPACITATING INJURY 1.04
## NO INDICATION OF INJURY 1.01
## NONINCAPACITATING INJURY 1.06
## PRIM_CONTRIBUTORY_CAUSEIMPROPER TURNING/NO SIGNAL
## INCAPACITATING INJURY 0.758
## NO INDICATION OF INJURY 0.745
## NONINCAPACITATING INJURY 0.759
## PRIM_CONTRIBUTORY_CAUSENOT APPLICABLE
## INCAPACITATING INJURY 0.485
## NO INDICATION OF INJURY 0.475
## NONINCAPACITATING INJURY 0.518
## PRIM_CONTRIBUTORY_CAUSEOPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER
## INCAPACITATING INJURY 0.642
## NO INDICATION OF INJURY 0.782
## NONINCAPACITATING INJURY 0.667
## PRIM_CONTRIBUTORY_CAUSEPHYSICAL CONDITION OF DRIVER
## INCAPACITATING INJURY 0.449
## NO INDICATION OF INJURY 0.665
## NONINCAPACITATING INJURY 0.456
## PRIM_CONTRIBUTORY_CAUSEROAD ENGINEERING/SURFACE/MARKING DEFECTS
## INCAPACITATING INJURY NaN
## NO INDICATION OF INJURY 4.92e-08
## NONINCAPACITATING INJURY 6.64e-08
## PRIM_CONTRIBUTORY_CAUSETEXTING
## INCAPACITATING INJURY 3.52e-11
## NO INDICATION OF INJURY 5.62e-13
## NONINCAPACITATING INJURY 1.63e-10
## PRIM_CONTRIBUTORY_CAUSETURNING RIGHT ON RED
## INCAPACITATING INJURY 8.33e-11
## NO INDICATION OF INJURY 4.00e-08
## NONINCAPACITATING INJURY 5.05e-08
## PRIM_CONTRIBUTORY_CAUSEUNABLE TO DETERMINE
## INCAPACITATING INJURY 0.285
## NO INDICATION OF INJURY 0.283
## NONINCAPACITATING INJURY 0.292
## PRIM_CONTRIBUTORY_CAUSEUNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN ARREST IS EFFECTED)
## INCAPACITATING INJURY 6.08e-01
## NO INDICATION OF INJURY 2.72e-07
## NONINCAPACITATING INJURY 6.23e-01
## PRIM_CONTRIBUTORY_CAUSEVISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)
## INCAPACITATING INJURY 1.14
## NO INDICATION OF INJURY 1.36
## NONINCAPACITATING INJURY 1.19
## PRIM_CONTRIBUTORY_CAUSEWEATHER
## INCAPACITATING INJURY 0.415
## NO INDICATION OF INJURY 0.413
## NONINCAPACITATING INJURY 0.380
## DEVICE_CONDITIONFUNCTIONING PROPERLY
## INCAPACITATING INJURY 1.41
## NO INDICATION OF INJURY 0.49
## NONINCAPACITATING INJURY 1.45
## DEVICE_CONDITIONNO CONTROLS
## INCAPACITATING INJURY 1.43
## NO INDICATION OF INJURY 0.47
## NONINCAPACITATING INJURY 1.47
## DEVICE_CONDITIONNOT FUNCTIONING DEVICE_CONDITIONOTHER
## INCAPACITATING INJURY 1.90e+00 0.785
## NO INDICATION OF INJURY 1.17e+00 1.084
## NONINCAPACITATING INJURY 3.68e-08 0.831
## DEVICE_CONDITIONUNKNOWN
## INCAPACITATING INJURY 1.500
## NO INDICATION OF INJURY 0.591
## NONINCAPACITATING INJURY 1.538
## TRAFFICWAY_TYPECENTER TURN LANE
## INCAPACITATING INJURY 1.40
## NO INDICATION OF INJURY 1.62
## NONINCAPACITATING INJURY 1.70
## TRAFFICWAY_TYPEDIVIDED - W/MEDIAN (NOT RAISED)
## INCAPACITATING INJURY 1.26
## NO INDICATION OF INJURY 1.35
## NONINCAPACITATING INJURY 1.57
## TRAFFICWAY_TYPEDIVIDED - W/MEDIAN BARRIER
## INCAPACITATING INJURY 1.29
## NO INDICATION OF INJURY 1.39
## NONINCAPACITATING INJURY 1.61
## TRAFFICWAY_TYPEDRIVEWAY TRAFFICWAY_TYPENOT DIVIDED
## INCAPACITATING INJURY 7.69e-11 1.24
## NO INDICATION OF INJURY 8.80e-01 1.33
## NONINCAPACITATING INJURY 8.80e-01 1.56
## TRAFFICWAY_TYPEONE-WAY TRAFFICWAY_TYPEOTHER
## INCAPACITATING INJURY 1.26 1.40
## NO INDICATION OF INJURY 1.34 1.53
## NONINCAPACITATING INJURY 1.58 1.71
## TRAFFICWAY_TYPEPARKING LOT TRAFFICWAY_TYPERAMP
## INCAPACITATING INJURY 1.73 3.31e-08
## NO INDICATION OF INJURY 1.69 2.90e-11
## NONINCAPACITATING INJURY 1.93 3.63e-11
## TRAFFICWAY_TYPEUNKNOWN INTERSECTION_RELATED_IN
## INCAPACITATING INJURY 1.98 1.203
## NO INDICATION OF INJURY 1.90 0.897
## NONINCAPACITATING INJURY 2.24 0.853
## INTERSECTION_RELATED_IY
## INCAPACITATING INJURY 0.308
## NO INDICATION OF INJURY 0.327
## NONINCAPACITATING INJURY 0.315
##
## Residual Deviance: 2391
## AIC: 2715
# Ensure categorical variables in test_data use the same levels as in train_data
for (col in colnames(train_data)) {
if (is.factor(train_data[[col]])) {
test_data[[col]] <- factor(test_data[[col]], levels = levels(train_data[[col]]))
}
}
# Make predictions
predictions <- predict(logistic_model, newdata = test_data)
# Confusion matrix
conf_matrix <- confusionMatrix(predictions, test_data$MOST_SEVERE_INJURY)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## FATAL 7 4 1
## INCAPACITATING INJURY 13 30 8
## NO INDICATION OF INJURY 10 22 62
## NONINCAPACITATING INJURY 3 22 11
## Reference
## Prediction NONINCAPACITATING INJURY
## FATAL 2
## INCAPACITATING INJURY 31
## NO INDICATION OF INJURY 21
## NONINCAPACITATING INJURY 15
##
## Overall Statistics
##
## Accuracy : 0.435
## 95% CI : (0.374, 0.498)
## No Information Rate : 0.313
## P-Value [Acc > NIR] : 2.17e-05
##
## Kappa : 0.206
##
## Mcnemar's Test P-Value : 0.000641
##
## Statistics by Class:
##
## Class: FATAL Class: INCAPACITATING INJURY
## Sensitivity 0.2121 0.385
## Specificity 0.9694 0.717
## Pos Pred Value 0.5000 0.366
## Neg Pred Value 0.8952 0.733
## Prevalence 0.1260 0.298
## Detection Rate 0.0267 0.115
## Detection Prevalence 0.0534 0.313
## Balanced Accuracy 0.5908 0.551
## Class: NO INDICATION OF INJURY
## Sensitivity 0.756
## Specificity 0.706
## Pos Pred Value 0.539
## Neg Pred Value 0.864
## Prevalence 0.313
## Detection Rate 0.237
## Detection Prevalence 0.439
## Balanced Accuracy 0.731
## Class: NONINCAPACITATING INJURY
## Sensitivity 0.2174
## Specificity 0.8135
## Pos Pred Value 0.2941
## Neg Pred Value 0.7441
## Prevalence 0.2634
## Detection Rate 0.0573
## Detection Prevalence 0.1947
## Balanced Accuracy 0.5154
# Accuracy
accuracy <- mean(predictions == test_data$MOST_SEVERE_INJURY)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 43.51 %"
###Random Forest
df_recent = read.csv("Traffic_Crashes_250206.csv")
# Removes rows where MOST_SEVERE_INJURY is exactly ""
df <- df_recent[df_recent$MOST_SEVERE_INJURY != "", ]
df <- df[df$MOST_SEVERE_INJURY != "REPORTED, NOT EVIDENT", ]
table(df$MOST_SEVERE_INJURY)
##
## FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## 851 11480 482399
## NONINCAPACITATING INJURY
## 53008
library(dplyr)
# 1. Define your majority classes
majority_classes <- c(
"NO INDICATION OF INJURY",
"NONINCAPACITATING INJURY",
"INCAPACITATING INJURY"
)
# 2. Split the data into 'majority' and 'minority' subsets
df_majority <- df %>%
filter(MOST_SEVERE_INJURY %in% majority_classes)
df_minority <- df %>%
filter(!MOST_SEVERE_INJURY %in% majority_classes)
# i.e., "FATAL" + "INCAPACITATING INJURY"
# 3. Undersample each majority class to a chosen size
# Adjust 'target_size' to suit your needs.
target_size <- 2000
set.seed(123) # for reproducibility
df_majority_undersampled <- df_majority %>%
group_by(MOST_SEVERE_INJURY) %>%
sample_n(size = target_size, replace = FALSE) %>%
ungroup()
# 4. Combine the minority subset (kept intact) with the undersampled majority
df_undersampled <- bind_rows(df_minority, df_majority_undersampled)
# 5. Check new distribution
table(df_undersampled$MOST_SEVERE_INJURY)
##
## FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## 851 2000 2000
## NONINCAPACITATING INJURY
## 2000
library(glmnet)
df_undersampled$MOST_SEVERE_INJURY <- as.factor(df_undersampled$MOST_SEVERE_INJURY)
colSums(is.na(df_undersampled))
## CRASH_RECORD_ID CRASH_DATE_EST_I
## 0 0
## CRASH_DATE POSTED_SPEED_LIMIT
## 0 0
## TRAFFIC_CONTROL_DEVICE DEVICE_CONDITION
## 0 0
## WEATHER_CONDITION LIGHTING_CONDITION
## 0 0
## FIRST_CRASH_TYPE TRAFFICWAY_TYPE
## 0 0
## LANE_CNT ALIGNMENT
## 5526 0
## ROADWAY_SURFACE_COND ROAD_DEFECT
## 0 0
## REPORT_TYPE CRASH_TYPE
## 0 0
## INTERSECTION_RELATED_I NOT_RIGHT_OF_WAY_I
## 0 0
## HIT_AND_RUN_I DAMAGE
## 0 0
## DATE_POLICE_NOTIFIED PRIM_CONTRIBUTORY_CAUSE
## 0 0
## SEC_CONTRIBUTORY_CAUSE STREET_NO
## 0 0
## STREET_DIRECTION STREET_NAME
## 0 0
## BEAT_OF_OCCURRENCE PHOTOS_TAKEN_I
## 0 0
## STATEMENTS_TAKEN_I DOORING_I
## 0 0
## WORK_ZONE_I WORK_ZONE_TYPE
## 0 0
## WORKERS_PRESENT_I NUM_UNITS
## 0 0
## MOST_SEVERE_INJURY INJURIES_TOTAL
## 0 0
## INJURIES_FATAL INJURIES_INCAPACITATING
## 0 0
## INJURIES_NON_INCAPACITATING INJURIES_REPORTED_NOT_EVIDENT
## 0 0
## INJURIES_NO_INDICATION INJURIES_UNKNOWN
## 0 0
## CRASH_HOUR CRASH_DAY_OF_WEEK
## 0 0
## CRASH_MONTH LATITUDE
## 0 40
## LONGITUDE LOCATION
## 40 0
df_undersampled <- as.data.frame(df_undersampled)
df_undersampled <- df_undersampled %>% na.omit()
# Convert categorical variables into factors
df_undersampled$MOST_SEVERE_INJURY <- as.factor(df_undersampled$MOST_SEVERE_INJURY)
# Create X matrix AFTER removing NAs
X <- model.matrix(MOST_SEVERE_INJURY~POSTED_SPEED_LIMIT +TRAFFIC_CONTROL_DEVICE+DEVICE_CONDITION+WEATHER_CONDITION+LIGHTING_CONDITION+FIRST_CRASH_TYPE+TRAFFICWAY_TYPE+ALIGNMENT+ROADWAY_SURFACE_COND+INTERSECTION_RELATED_I+NOT_RIGHT_OF_WAY_I+HIT_AND_RUN_I+PRIM_CONTRIBUTORY_CAUSE+CRASH_HOUR+CRASH_MONTH+LATITUDE+LONGITUDE, data = df_undersampled)[, -1]
# Ensure y matches X in row count
y <- df_undersampled$MOST_SEVERE_INJURY
# Check if X and y have the same number of rows
nrow(X) == length(y) # Should return TRUE
## [1] TRUE
# Perform cross-validation for multinomial logistic LASSO
cvfit <- cv.glmnet(
x = X,
y = y,
family = "multinomial", # for multi-class
type.multinomial = "grouped", # treats coefficients of each class as a group
alpha = 1, # alpha=1 => LASSO penalty
nfolds = 5 # 5-fold cross-validation (adjust as needed)
)
# Plot cross-validation curves
plot(cvfit)
# Identify best lambda
best_lambda <- cvfit$lambda.min
best_lambda
## [1] 0.018
# Convert y to a Proper Factor Before Training
y <- as.factor(y)
classnames <- levels(y)
# Refit the final model at best lambda
final_model <- glmnet(
x = X,
y = y,
family = "multinomial",
alpha = 1,
lambda = cvfit$lambda.1se
)
coef.1se <- coef(final_model, s = "lambda.1se") # Get coefficients at lambda.1se
# Step 6: Retrieve the names of the selected variables
var.1se <- rownames(as.matrix(coef.1se))[-1] # Extract variable names, excluding intercept
var.1se
## [1] "INCAPACITATING INJURY" "NO INDICATION OF INJURY"
## [3] "NONINCAPACITATING INJURY"
library(randomForest)
library(caret)
set.seed(123) # for reproducibility
train_index <- createDataPartition(df_undersampled$MOST_SEVERE_INJURY, p = 0.8, list = FALSE)
df_train <- df_undersampled[train_index, ]
df_test <- df_undersampled[-train_index, ]
table(df_train$MOST_SEVERE_INJURY)
##
## FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## 132 315 331
## NONINCAPACITATING INJURY
## 279
table(df_test$MOST_SEVERE_INJURY)
##
## FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## 33 78 82
## NONINCAPACITATING INJURY
## 69
# run the model
set.seed(123)
rf_model <- randomForest(
MOST_SEVERE_INJURY ~ POSTED_SPEED_LIMIT +TRAFFIC_CONTROL_DEVICE+DEVICE_CONDITION+WEATHER_CONDITION+LIGHTING_CONDITION+FIRST_CRASH_TYPE+TRAFFICWAY_TYPE+ALIGNMENT+ROADWAY_SURFACE_COND+INTERSECTION_RELATED_I+NOT_RIGHT_OF_WAY_I+HIT_AND_RUN_I+PRIM_CONTRIBUTORY_CAUSE+CRASH_HOUR+CRASH_MONTH+LATITUDE+LONGITUDE, # target ~ all other columns
data = df_train,
ntree = 100, # number of trees
mtry = 15, # if NULL, sqrt(#predictors) for classification
importance = TRUE # track variable importance
)
# Print summary of the model
print(rf_model)
##
## Call:
## randomForest(formula = MOST_SEVERE_INJURY ~ POSTED_SPEED_LIMIT + TRAFFIC_CONTROL_DEVICE + DEVICE_CONDITION + WEATHER_CONDITION + LIGHTING_CONDITION + FIRST_CRASH_TYPE + TRAFFICWAY_TYPE + ALIGNMENT + ROADWAY_SURFACE_COND + INTERSECTION_RELATED_I + NOT_RIGHT_OF_WAY_I + HIT_AND_RUN_I + PRIM_CONTRIBUTORY_CAUSE + CRASH_HOUR + CRASH_MONTH + LATITUDE + LONGITUDE, data = df_train, ntree = 100, mtry = 15, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 15
##
## OOB estimate of error rate: 56.3%
## Confusion matrix:
## FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## FATAL 35 43 19
## INCAPACITATING INJURY 18 135 83
## NO INDICATION OF INJURY 8 71 209
## NONINCAPACITATING INJURY 18 108 70
## NONINCAPACITATING INJURY class.error
## FATAL 35 0.735
## INCAPACITATING INJURY 79 0.571
## NO INDICATION OF INJURY 43 0.369
## NONINCAPACITATING INJURY 83 0.703
# Plot error vs. number of trees
plot(rf_model)
# 1. Predict classes
test_preds <- predict(rf_model, newdata = df_test)
# Confusion Matrix with test stats
confusionMatrix(
data = test_preds,
reference = df_test$MOST_SEVERE_INJURY
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FATAL INCAPACITATING INJURY NO INDICATION OF INJURY
## FATAL 7 6 9
## INCAPACITATING INJURY 12 42 26
## NO INDICATION OF INJURY 3 13 36
## NONINCAPACITATING INJURY 11 17 11
## Reference
## Prediction NONINCAPACITATING INJURY
## FATAL 5
## INCAPACITATING INJURY 39
## NO INDICATION OF INJURY 6
## NONINCAPACITATING INJURY 19
##
## Overall Statistics
##
## Accuracy : 0.397
## 95% CI : (0.337, 0.459)
## No Information Rate : 0.313
## P-Value [Acc > NIR] : 0.00245
##
## Kappa : 0.167
##
## Mcnemar's Test P-Value : 0.00137
##
## Statistics by Class:
##
## Class: FATAL Class: INCAPACITATING INJURY
## Sensitivity 0.2121 0.538
## Specificity 0.9127 0.582
## Pos Pred Value 0.2593 0.353
## Neg Pred Value 0.8894 0.748
## Prevalence 0.1260 0.298
## Detection Rate 0.0267 0.160
## Detection Prevalence 0.1031 0.454
## Balanced Accuracy 0.5624 0.560
## Class: NO INDICATION OF INJURY
## Sensitivity 0.439
## Specificity 0.878
## Pos Pred Value 0.621
## Neg Pred Value 0.775
## Prevalence 0.313
## Detection Rate 0.137
## Detection Prevalence 0.221
## Balanced Accuracy 0.658
## Class: NONINCAPACITATING INJURY
## Sensitivity 0.2754
## Specificity 0.7979
## Pos Pred Value 0.3276
## Neg Pred Value 0.7549
## Prevalence 0.2634
## Detection Rate 0.0725
## Detection Prevalence 0.2214
## Balanced Accuracy 0.5366
# Suppose your model is called rf_model
importance_matrix <- importance(rf_model)
# Convert to a data frame for plotting
library(dplyr)
# Convert to a data frame for plotting
imp_df <- data.frame(
Variable = rownames(importance_matrix),
MeanDecreaseAccuracy = importance_matrix[, "MeanDecreaseAccuracy"]
)
# Sort descending by MeanDecreaseAccuracy and keep top 20
imp_df_top20 <- imp_df %>%
arrange(desc(MeanDecreaseAccuracy)) %>%
head(20)
# Plot the variable importance
library(ggplot2)
ggplot(imp_df_top20,
aes(x = reorder(Variable, MeanDecreaseAccuracy),
y = MeanDecreaseAccuracy)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(
x = "Variable",
y = "Mean Decrease Accuracy",
title = "Top 20 Variables by Random Forest Importance"
) +
theme_minimal()